/*
    PonySE word segmenter
    Copyright (C) 2007-2008 PonySE

    This program is free software: you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation, either version 3 of the License, or
    (at your option) any later version.

    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License
    along with this program.  If not, see <http://www.gnu.org/licenses/>.
*/

/**
 * PonySE word segmenter implementation file
 * @file
 * @brief PonySE word segmenter implementation (public interface is declared in pws.h)
 * @version 0.0.2
 * @author chengyan
 * @date 03/07/2008 0.0.2 change name to "PonySE word segment" and change all function
 * @date 03/07/2008 0.0.1 add function ws_get_words(), by chengyan
 * @date 12/27/2007 0.0.0 created, by chengyan
 */

/***** modified by zouxin to fix a small bug *****/

/* stdc */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <memory.h>

/* STL */
#include <algorithm>
#include <map>
#include <string>
#include <vector>

/* this project */
#include "pws.h"

/* size in bytes (terminating NUL included) of the buffer pws_initialize() reads each dictionary line into */
#define LENGTH_LINEBUF     33
/* presumably the maximum supported keyword length in bytes (LENGTH_KEYWORDBUF - 1) -- TODO confirm; not referenced by the code in this file */
#define MAX_LENGTH_KEYWORD 24
/* size in bytes of a buffer holding one keyword plus its terminating NUL */
#define LENGTH_KEYWORDBUF  25

/**
 * Word segmentation result.
 *
 * Filled in by pws_segment() (wordid only) or pws_segment_full() (wordid and
 * words). Both arrays are allocated with malloc() and released by
 * pws_res_clean().
 */
struct pws_result
{
	long length;		/* number of entries in wordid (and in words, when words is set) */
	long * wordid;		/* array of matched word ids, or 0 when there is no result */
	const char ** words;	/* array of matched keyword strings, or 0; only pws_segment_full() fills it */
	/*
	long * word_attr;	words attribute
	*/
};

/* maximum keyword length (in bytes) recorded by pws_initialize(); the
   segmenters use it as the longest window they try to match */
static int g_keyword_maxlength = 0;

/* dictionary map: keyword -> word id, filled by pws_initialize() and
   emptied by pws_release() */
static std::map<std::string,long> g_map_dict;

/* wordid -> word */
/*
static std::map<long, std::string> g_map_wordid;
*/

/**
 * Initialize the PonySE word segmenter by loading a dictionary file.
 *
 * Every accepted line adds (or overwrites) one keyword -> word id entry in
 * g_map_dict and raises g_keyword_maxlength when the keyword is longer than
 * any seen so far. Entries loaded by an earlier call are kept.
 *
 * @remark dictionary file format:
 * 	1<space>keyword1[\r]<\n>
 * 	2<space>keyword2[\r]<\n>
 * 	...
 * The keyword is the text after the last space of the line. A line is
 * skipped when it has no space after its first character, when its keyword
 * is empty, when its id is negative, or when it does not fit into the line
 * buffer (the keyword would be incomplete).
 *
 * @param dict_path path of the dictionary file
 * @return 0 on success, -1 if dict_path is null or empty, -2 if the file
 *         cannot be opened
 */
int pws_initialize( const char * dict_path )
{
	char linebuf[LENGTH_LINEBUF];
	FILE * fp = 0;

	if ( dict_path==0 || *dict_path==0 )
		return -1;

	if ( (fp=fopen(dict_path,"r")) == 0 )
		return -2;

	while ( fgets(linebuf,LENGTH_LINEBUF,fp) != 0 )
	{
		const size_t line_len = strlen( linebuf );
		bool lost_payload = false;

		/* only possible when the line starts with a NUL byte */
		if ( line_len == 0 )
			continue;

		/* No '\n' in the buffer: either the last line of the file or a line
		 * longer than the buffer. Drain the rest of the physical line so it
		 * is not parsed as a separate entry, and remember whether anything
		 * other than the line terminator was cut off. */
		if ( linebuf[line_len-1] != '\n' )
		{
			int ch = 0;
			while ( (ch=fgetc(fp)) != EOF && ch != '\n' )
			{
				if ( ch != '\r' )
					lost_payload = true;
			}
		}
		if ( lost_payload )
			continue;

		/* cut the line at the first '\r' or '\n' */
		linebuf[ strcspn(linebuf,"\r\n") ] = 0;
		if ( linebuf[0] == 0 )
			continue;

		/* convert "1<space>keyword1" to "1<\0>keyword1"; the search starts at
		 * the second character so the id part is never empty */
		char * sep = strrchr( linebuf+1, ' ' );
		if ( sep == 0 )
			continue;
		*sep = 0;

		const char * pos_word = sep + 1;
		if ( *pos_word == 0 )
			continue;

		/* strtol (unlike atol) has defined behaviour for out-of-range ids */
		const long word_id = strtol( linebuf, 0, 10 );
		if ( word_id < 0 )
			continue;

		g_map_dict[pos_word] = word_id;
		/* g_map_wordid[word_id] = pos_word; */

		/* get keyword max length, measured on the keyword actually stored */
		const int keyword_len = static_cast<int>( strlen(pos_word) );
		if ( g_keyword_maxlength < keyword_len )
			g_keyword_maxlength = keyword_len;
	}

	fclose( fp );

	return 0;
}

/**
 * Segment content into an array of word ids (saved in pws_result_obj).
 *
 * Scans content from left to right. At each position the longest window
 * (at most g_keyword_maxlength bytes) is looked up in g_map_dict and shrunk
 * one byte at a time until a keyword matches; the scan then continues right
 * after the match. Windows of a single byte are never looked up: when
 * nothing longer matches, the scan advances by one byte.
 *
 * The result object is reset first without freeing anything it held before,
 * so call pws_res_clean() on an object that already carries a result.
 *
 * @param content        bytes to segment (need not be NUL-terminated)
 * @param len            number of bytes in content
 * @param pws_result_obj receives length and a malloc'd wordid array; both
 *                       stay 0 when nothing matched, when content is null or
 *                       empty, or when the allocation fails. words is
 *                       always set to 0. A null pointer is ignored.
 */
void pws_segment( const char * content, long len, pws_result_t * pws_result_obj )
{
	const char * pos = 0, * end_pos = 0;
	long probe_len = 0, remaining = 0;
	std::map<std::string, long>::iterator iter_dict;
	std::vector<long> vec_wordid;
	std::string probe; /* current window; reused to keep its capacity */

	if ( pws_result_obj == 0 )
		return;

	/* set default value */
	pws_result_obj->length = 0;
	pws_result_obj->wordid = 0;
	pws_result_obj->words = 0;

	if ( content==0 || len<=0 )
		return;

	pos = content;
	end_pos = pos + len;
	while ( pos < end_pos )
	{
		remaining = end_pos - pos;
		probe_len = remaining<g_keyword_maxlength ? remaining : g_keyword_maxlength;
		/* always move forward, even when no dictionary has been loaded
		 * (g_keyword_maxlength == 0) */
		if ( probe_len < 1 )
			probe_len = 1;

		/* The window is an explicit-length std::string, so its size is not
		 * limited by a fixed buffer and embedded NUL bytes cannot shorten
		 * the lookup key. */
		probe.assign( pos, static_cast<size_t>(probe_len) );
		while ( probe_len > 1 )
		{
			iter_dict = g_map_dict.find( probe );
			if ( iter_dict != g_map_dict.end() )
			{
				vec_wordid.push_back( iter_dict->second );
				break;
			}

			probe_len--;
			probe.resize( static_cast<size_t>(probe_len) );
		}

		/* matched keyword length, or 1 when nothing matched */
		pos += probe_len;
	}

	if ( vec_wordid.empty() )
		return;

	pws_result_obj->wordid = (long *)malloc( sizeof(long) * vec_wordid.size() );
	if ( pws_result_obj->wordid == 0 )
		return;

	pws_result_obj->length = static_cast<long>( vec_wordid.size() );
	std::copy( vec_wordid.begin(), vec_wordid.end(), pws_result_obj->wordid );
}

/**
 * Segment content into an array of word ids and an array of the matched
 * keyword strings (both saved in pws_result_obj).
 *
 * Scans content from left to right. At each position the longest window
 * (at most g_keyword_maxlength bytes) is looked up in g_map_dict and shrunk
 * one byte at a time until a keyword matches; the scan then continues right
 * after the match. Windows of a single byte are never looked up: when
 * nothing longer matches, the scan advances by one byte.
 *
 * The stored keyword pointers refer to the key strings inside g_map_dict:
 * they are not copies, must not be freed one by one, and stop being valid
 * once the dictionary is cleared by pws_release().
 *
 * The result object is reset first without freeing anything it held before,
 * so call pws_res_clean() on an object that already carries a result.
 *
 * @param content        bytes to segment (need not be NUL-terminated)
 * @param len            number of bytes in content
 * @param pws_result_obj receives length plus malloc'd wordid and words
 *                       arrays; all stay 0 when nothing matched, when
 *                       content is null or empty, or when an allocation
 *                       fails. A null pointer is ignored.
 */
void pws_segment_full( const char * content, long len, pws_result_t * pws_result_obj )
{
	const char * pos = 0, * end_pos = 0;
	long probe_len = 0, remaining = 0;
	std::map<std::string, long>::iterator iter_dict;
	std::vector<long> vec_wordid;
	std::vector<const char *> vec_words;
	std::string probe; /* current window; reused to keep its capacity */

	if ( pws_result_obj == 0 )
		return;

	/* set default value */
	pws_result_obj->length = 0;
	pws_result_obj->wordid = 0;
	pws_result_obj->words = 0;

	if ( content==0 || len<=0 )
		return;

	pos = content;
	end_pos = pos + len;
	while ( pos < end_pos )
	{
		remaining = end_pos - pos;
		probe_len = remaining<g_keyword_maxlength ? remaining : g_keyword_maxlength;
		/* always move forward, even when no dictionary has been loaded
		 * (g_keyword_maxlength == 0) */
		if ( probe_len < 1 )
			probe_len = 1;

		/* The window is an explicit-length std::string, so its size is not
		 * limited by a fixed buffer and embedded NUL bytes cannot shorten
		 * the lookup key. */
		probe.assign( pos, static_cast<size_t>(probe_len) );
		while ( probe_len > 1 )
		{
			iter_dict = g_map_dict.find( probe );
			if ( iter_dict != g_map_dict.end() )
			{
				vec_wordid.push_back( iter_dict->second );
				vec_words.push_back( (iter_dict->first).c_str() );
				break;
			}

			probe_len--;
			probe.resize( static_cast<size_t>(probe_len) );
		}

		/* matched keyword length, or 1 when nothing matched */
		pos += probe_len;
	}

	if ( vec_wordid.empty() )
		return;

	pws_result_obj->wordid = (long *)malloc( sizeof(long) * vec_wordid.size() );
	if ( pws_result_obj->wordid == 0 )
		return;

	pws_result_obj->words = (const char **)malloc( sizeof(char*) * vec_words.size() );
	if ( pws_result_obj->words == 0 )
	{
		/* keep the object consistent: either both arrays or none */
		free( pws_result_obj->wordid );
		pws_result_obj->wordid = 0;
		return;
	}

	pws_result_obj->length = static_cast<long>( vec_wordid.size() );
	std::copy( vec_wordid.begin(), vec_wordid.end(), pws_result_obj->wordid );
	std::copy( vec_words.begin(), vec_words.end(), pws_result_obj->words );
}

/**
 * Release the PonySE word segmenter.
 *
 * Empties the keyword -> word id dictionary. g_keyword_maxlength is left
 * untouched. Keyword pointers stored by pws_segment_full() refer to the
 * dictionary entries and must not be used after this call.
 */
void pws_release()
{
	/* hand all entries to a temporary map that is destroyed on return */
	std::map<std::string,long> discarded;
	g_map_dict.swap( discarded );
}

/**
 * Create a pws_result_t object.
 *
 * @return a zero-filled result object allocated from the C heap, or 0 when
 *         the allocation fails; release it with pws_res_free()
 */
pws_result_t * pws_res_create()
{
	/* calloc returns zero-filled storage, so no separate memset is needed */
	return static_cast<pws_result_t *>( calloc(1, sizeof(pws_result_t)) );
}

/**
 * Only clean a pws_result_t object: free its wordid and words arrays and
 * reset it to the empty state. The object itself is not freed, and neither
 * are the strings the words array points to.
 *
 * @param pws_result_obj object to clean; a null pointer is ignored
 */
void pws_res_clean( pws_result_t * pws_result_obj )
{
	if ( pws_result_obj == 0 )
		return;

	/* free() accepts null pointers, so no per-field check is required */
	free( pws_result_obj->wordid );
	pws_result_obj->wordid = 0;

	free( pws_result_obj->words );
	pws_result_obj->words = 0;

	pws_result_obj->length = 0;
}

/**
 * Free and destroy a pws_result_t object which was created by function
 * pws_res_create, then set the caller's pointer to 0.
 *
 * @param pws_result_obj address of the object pointer; nothing happens when
 *                       it is null or when the pointer it refers to is
 *                       already null, so calling this twice on the same
 *                       handle is harmless
 */
void pws_res_free( pws_result_t ** pws_result_obj )
{
	if ( pws_result_obj == 0 || *pws_result_obj == 0 )
		return;

	pws_res_clean( *pws_result_obj );
	free( *pws_result_obj );
	*pws_result_obj = 0;
}

/**
 * Get the words' identification numbers from a pws_result_t object.
 *
 * @param pws_result_obj result object to read; may be null
 * @param wordid         receives the object's wordid array (still owned by
 *                       the object), or 0 when pws_result_obj is null; pass
 *                       0 to query the count only
 * @return number of entries in the array, 0 when pws_result_obj is null
 */
long pws_res_wordid( const pws_result_t * pws_result_obj, long ** wordid )
{
	if ( pws_result_obj == 0 )
	{
		if ( wordid != 0 )
			*wordid = 0;
		return 0;
	}

	if ( wordid != 0 )
		*wordid = pws_result_obj->wordid;
	return pws_result_obj->length;
}

/**
 * Get the words' identification numbers and word details from a
 * pws_result_t object.
 *
 * Word details are only available when the object holds a words array
 * (filled by pws_segment_full()). Otherwise 0 is returned and both outputs
 * are set to 0, so callers never read an unset pointer.
 *
 * @param pws_result_obj result object to read; may be null
 * @param wordid         receives the wordid array (owned by the object) or 0;
 *                       may be null
 * @param word           receives the words array (owned by the object) or 0;
 *                       may be null
 * @return number of entries in both arrays, 0 when no word details exist
 */
long pws_res_words( const pws_result_t * pws_result_obj, long ** wordid, const char *** word )
{
	const bool has_words = ( pws_result_obj != 0 && pws_result_obj->words != 0 );

	if ( wordid != 0 )
		*wordid = has_words ? pws_result_obj->wordid : 0;
	if ( word != 0 )
		*word = has_words ? pws_result_obj->words : 0;

	return has_words ? pws_result_obj->length : 0;
}

